| <expr> | Memory | Time | ||||||
|---|---|---|---|---|---|---|---|---|
| profvis({ | ||||||||
| library("gridExtra") #for saving png files in a specific order into pdf | ||||||||
| library("ggplot2") | ||||||||
| library('ggdendro') | ||||||||
| # library("TSclust") | ||||||||
| library("ggmap") #used to plot maps | ||||||||
| library("maps") | ||||||||
| library(scales) # for function alpha() | ||||||||
| library("compiler") # to speed up the computations! | ||||||||
| library("plyr") | ||||||||
| library("hexbin") #for hexoganal binning | ||||||||
| library("rgeos") #for creating maps | ||||||||
| library("png") #for reading png files | ||||||||
| library("grid") #for arranging png files | ||||||||
| library("data.table") #for faster creation of crosstables from data set & for faster searches of datatables; brings about a lot of speed-up! https://github.com/Rdatatable/data.table/wiki/Getting-started | ||||||||
| library("bit64") #for loading data with fread | ||||||||
| root_path <- "C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterEpi/ExploratoryAnalysis" # defining root_path containing all relevant documents | ||||||||
| # # NB!! the BEST WAY TO CHOOSE THE BIN IS probably ***** "FD" *****: | ||||||||
| # # http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30 | ||||||||
| # | ||||||||
| # # LOADING and MERGING DATA FRAMES ------ | ||||||||
| # | ||||||||
| # setwd(root_path) # setting WD | ||||||||
| # #function to make selection of datatable based on pre_set coordinates | ||||||||
| # | ||||||||
| # #loading files from sick patients | ||||||||
| # #see http://stackoverflow.com/questions/11433432/importing-multiple-csv-files-into-r for explanation about reading several csv-files at once | ||||||||
| # setwd("C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterData/tweets_from_todd/csv_files/sick_csv") # temporarily set WD to folder with files from healthy Twitter users | ||||||||
| # | ||||||||
| # temp = list.files(pattern="*.csv") #read names of all .csv files | ||||||||
| # | ||||||||
| # #creates names from csv-files in folder; | ||||||||
| # names <- setNames(temp, make.names(gsub("*.csv$", "", temp))) #gsub uses regex to replace the specified patterns within a name | ||||||||
| # | ||||||||
| # #loading df into environment | ||||||||
| # list2env(lapply(names,fread, header=FALSE), envir = .GlobalEnv) #"fread" reads in the data from csv | ||||||||
| # | ||||||||
| # #create a list of all the datatables | ||||||||
| # sick_list <- lapply(attr(names,"names"),get) | ||||||||
| # | ||||||||
| # #combine into a single datatable | ||||||||
| # sick_df <- do.call("rbind",sick_list) | ||||||||
| # | ||||||||
| # remove(list = attr(names,"names"))#removing single df to save RAM | ||||||||
| # remove(sick_list)#removing sick_list to save RAM | ||||||||
| # | ||||||||
| # col_names <- c('userID','longitude','latitude','time','sick','state') | ||||||||
| # colnames(sick_df) <- col_names | ||||||||
| # setkeyv(sick_df,col_names) | ||||||||
| # alarm() | ||||||||
| # | ||||||||
| # #loading data from healthy Twitter users | ||||||||
| # setwd("C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterData/tweets_from_todd/csv_files/one_hundred_csv") # temporarily set WD to folder with files from healthy Twitter users | ||||||||
| # temp = list.files(pattern="*.csv") #read names of all .csv files | ||||||||
| # | ||||||||
| # #creates names from csv-files in folder; | ||||||||
| # names <- setNames(temp, make.names(gsub("*.csv$", "", temp))) #gsub uses regex to replace the specified patterns within a name | ||||||||
| # | ||||||||
| # #loading df into environment | ||||||||
| # list2env(lapply(names,fread, header=FALSE), envir = .GlobalEnv) | ||||||||
| # | ||||||||
| # #create a list of all the datatables | ||||||||
| # healthy_list <- lapply(attr(names,"names"),get) | ||||||||
| # | ||||||||
| # #combine into a single datatable | ||||||||
| # healthy_df <- do.call("rbind",healthy_list) | ||||||||
| # | ||||||||
| # remove(list = attr(names,"names"))#removing single df to save RAM | ||||||||
| # remove(healthy_list)#removing sick_list to save RAM | ||||||||
| # remove(list= c("names","temp")) | ||||||||
| # | ||||||||
| # colnames(healthy_df) <- col_names | ||||||||
| # setkeyv(healthy_df, col_names) #sets key to column "userID" | ||||||||
| # remove(col_names) | ||||||||
| # alarm() | ||||||||
| # | ||||||||
| # setwd(root_path) # set WD back | ||||||||
| # | ||||||||
| # save.image(file="Twitter_datatables.RData") #saving loaded datatable to prevent loading it from the excel-files the next time | ||||||||
| # EXPLORATORY DATA ANALYSIS ------ | ||||||||
| setwd(root_path) # set WD back | ||||||||
| load(file="Twitter_datatables.RData") #if the code above has been executed once, you can uncomment it and start directly from here | ||||||||
| title_plot<-"All tweets from " #generic title plot used in some functions | ||||||||
| # to_analyse <- "healthy_df" | ||||||||
| # rm(list=setdiff(ls(), to_analyse)) #removes all entries from workspace except for the datatable that shall be analysed | ||||||||
| # | ||||||||
| #funtion to make selection of datatable based on coordinate (lon_west,lon_est,lat_south,lat_north) | ||||||||
| coord_selection <- function(datatable,coord_selec) | ||||||||
| { | ||||||||
| selec <- datatable[datatable[,longitude >=coord_selec[1] & longitude <= coord_selec[2] & latitude >= coord_selec[3] & latitude <=coord_selec[4]],] | ||||||||
| #selec <- datatable[which(datatable[,"longitude",]>=coord_selec[1] & datatable[,"longitude"] <= coord_selec[2] & datatable[,"latitude"] >= coord_selec[3] & datatable[,"latitude"] <= coord_selec[4]),] #old way to do it with dataframes | ||||||||
| } | ||||||||
| #function to make selection of datatable based on coordinate (lon_west,lon_est,lat_south,lat_north); also returns index | ||||||||
| coord_selection2 <- function(datatable,coord_selec) # | ||||||||
| { | ||||||||
| selec <- datatable[datatable[,longitude >=coord_selec[1] & longitude <= coord_selec[2] & latitude >= coord_selec[3] & latitude <=coord_selec[4]],] | ||||||||
| index <-datatable[,longitude >=coord_selec[1] & longitude <= coord_selec[2] & latitude >= coord_selec[3] & latitude <=coord_selec[4]] | ||||||||
| return(list(selec,index)) | ||||||||
| } | ||||||||
| coord_USA <- c(-125,-66,25,50) #select only tweets from mainland USA | ||||||||
| sick_df <- coord_selection(sick_df, coord_USA) | ||||||||
| healthy_df <- coord_selection(healthy_df,coord_USA) | ||||||||
| explore_data <- function(datatable,sickness_state){ #"sickness_state" takes values "sick" or "healthy" and signifies the state that the users represented in the dataste *should* be in | ||||||||
| all_users<-unique(datatable[,userID]) #unique returns a vector, data frame or array like x but with duplicate elements/rows removed; in this case = unique return of user_ID | ||||||||
| num_users <- length(all_users) | ||||||||
| sick_position <- which(datatable[,sick]==1) #gets position of tweets labelled as asick | ||||||||
| num_sick_tweets<-sum(datatable[,sick]==1) #returns number of tweets that are labelled as "sick" | ||||||||
| #sick_tweets<-datatable[sick_position,5] #returns entries that are labelled as sick | ||||||||
| # NB! n sick tweets != n sick users!!! | ||||||||
| sick_users<-unique(datatable[sick_position,userID]) | ||||||||
| num_sick_users <- length(sick_users) | ||||||||
| #getting healthy | ||||||||
| healthy_position <- which(datatable[,sick]==0) | ||||||||
| #healthy_tweets <- unique(datatable[healthy_position,5]) #returns entries that are labelled as healthy | ||||||||
| num_healthy_tweets <- sum(datatable[,sick]==0) | ||||||||
| healthy_users <- unique(datatable[healthy_position,userID]) #getting healthy users | ||||||||
| num_healthy_users <- length(healthy_users) | ||||||||
| #check the total number of false labels | ||||||||
| if (sickness_state == "sick"){ #checking whether there are any users in a "sick" datatable that have never been sick, i.e. that healthy_users that don't show up in sick_users | ||||||||
| false_label <- healthy_users[!(healthy_users %in% sick_users)] | ||||||||
| num_false_label <- length(false_label) | ||||||||
| } | ||||||||
| else if (sickness_state == "healthy"){ | ||||||||
| false_label <- sick_users | ||||||||
| num_false_label <- num_sick_users | ||||||||
| } | ||||||||
| out <- list(all_users,num_users,sick_position,num_sick_tweets,sick_users,num_sick_users,healthy_position,num_healthy_tweets,healthy_users,num_healthy_users,false_label,num_false_label) | ||||||||
| names(out) <- c("all_users","num_users", "sick_position","num_sick_tweets","sick_users","num_sick_users","healthy_position", "num_healthy_tweets", "healthy_users","num_healthy_users","false_label","num_false_label") | ||||||||
| return(out) | ||||||||
| } | ||||||||
| #get preliminary info from datatables | ||||||||
| explore_sick <- explore_data(sick_df,"sick") | ||||||||
| str(explore_sick) | ||||||||
| explore_sick <- list(explore_sick$false_label) #prune list to save memory | ||||||||
| names(explore_sick) <- "false_label" | ||||||||
| explore_healthy <- explore_data(healthy_df,"healthy") | ||||||||
| str(explore_healthy) | ||||||||
| explore_healthy <- list(explore_healthy$false_label) #reduce size of list to save memory | ||||||||
| names(explore_healthy) <- "false_label" | ||||||||
| # ---------------- here we analyse the data `in space' ------------ | ||||||||
| # # tweets on maps using scatterplots---- | ||||||||
| # | ||||||||
| # plot_location <- function(datatable,explore,tag) | ||||||||
| # { | ||||||||
| # dir.create("img_tmp") #create new directory to story images | ||||||||
| # setwd(paste0(root_path,"/img_tmp")) | ||||||||
| # ###set-up### | ||||||||
| # #function print spatial distribution of sick tweets | ||||||||
| # print_map <- function(my_map, my_title, my_coord, colo="red",pt_size=0.5){ | ||||||||
| # ggmap(my_map)+ggtitle(my_title) + geom_point(aes_string(my_coord[,"longitude"],my_coord[,"latitude"]),color=colo,data=my_coord,alpha=.3,size=pt_size) | ||||||||
| # } | ||||||||
| # | ||||||||
| # #prune datatable to save memory | ||||||||
| # datatable <- datatable[,c(1:3,5)] | ||||||||
| # | ||||||||
| # #define pt-size | ||||||||
| # pt_size_USA <- 1.5 | ||||||||
| # pt_size_local <- 7.5 | ||||||||
| # | ||||||||
| # #create names for each map | ||||||||
| # img_names <- c("sicktweets","sicktweets_local","healthytweets","healthytweets_local","mislabelledtweets","mislabelledtweets_local") | ||||||||
| # | ||||||||
| # filenames <- paste(img_names,tag,sep="_") | ||||||||
| # filenames <- paste(filenames,"png",sep=".") | ||||||||
| # | ||||||||
| # #define size of each image (in inchex) | ||||||||
| # img_size <- 20 | ||||||||
| # | ||||||||
| # ###create maps with tweets for the whole continent### | ||||||||
| # my_ggmap<-get_map(location = "Kansas",zoom = 3,source = "google", maptype = "terrain",color = 'bw') # my_ggmap<- map_data('world'); downloading map with focus on Kansas, zoom = 3 (continent) | ||||||||
| # | ||||||||
| # ##print all tweets labelled as "sick" in the whole USA## | ||||||||
| # tot_sick <- datatable[datatable[,"sick"]==1,] #subset of all tweets labelled as "sick" | ||||||||
| # | ||||||||
| # #png(filename=filenames[1],width=2000,height=2000) #open pdf to save images | ||||||||
| # p <- print_map(my_ggmap,my_title="Sick tweets",my_coord=tot_sick,colo="red",pt_size=pt_size_USA) | ||||||||
| # p <- print | ||||||||
| # ggsave(file=filenames[1],width=img_size,height=img_size) | ||||||||
| # #dev.off() #close pdf | ||||||||
| # | ||||||||
| # remove(list=c("p","tot_sick")) #remove map created and datatable to save memory | ||||||||
| # | ||||||||
| # ##print all tweets labelled as "healthy" in the whole USA## | ||||||||
| # tot_healthy <- datatable[datatable[,"sick"]==0,] | ||||||||
| # | ||||||||
| # p <- print_map(my_ggmap,my_title="Healthy tweets",my_coord=tot_healthy,colo="blue",pt_size=pt_size_USA) | ||||||||
| # ggsave(file=filenames[3],width=img_size,height=img_size) | ||||||||
| # | ||||||||
| # remove(list=c("p","tot_healthy")) #remove map created and datatable to save memory | ||||||||
| # | ||||||||
| # ##print all tweets mislabelled in the whole USA## | ||||||||
| # tot_mislabelled <- datatable[which(datatable[,"userID"] %in% explore$false_label),] #gets all tweets from those users who are in the wrong category ("sick" users in the "healthy" datatable or "healthy" users in the "sick" data set) | ||||||||
| # p <- print_map(my_ggmap,my_title="Mislabelled Tweets",my_coord=tot_mislabelled,col="deeppink",pt_size=pt_size_USA) | ||||||||
| # ggsave(file=filenames[5],width=img_size,height=img_size) | ||||||||
| # | ||||||||
| # remove(list=c("p","tot_mislabelled","my_ggmap")) #remove map created and datatable to save memory; also remove ggmap of USA since it won't be used afterwards | ||||||||
| # | ||||||||
| # ###create local maps with tweets### | ||||||||
| # #create google-map for a certain region | ||||||||
| # my_ggmap_local <- get_map(location = "NewYork", zoom= 8, source="google",maptype = "terrain",color = "bw")#local map | ||||||||
| # coord_local <- as.numeric(attr(my_ggmap_local,"bb"))#get coordinates of map | ||||||||
| # coord_local <- c(coord_local[2],coord_local[4],coord_local[1],coord_local[3]) #reorder coordinates to make it work for function coord_selection | ||||||||
| # | ||||||||
| # local_selection <- coord_selection2(datatable,coord_local) #select only those values which are within the local map region | ||||||||
| # | ||||||||
| # local_index <- local_selection[[2]] | ||||||||
| # local_selection <- local_selection[[1]] | ||||||||
| # remove(datatable) #to save memory | ||||||||
| # | ||||||||
| # ##print all tweets labelled as "sick" in the selected region## | ||||||||
| # local_sick <- local_selection[local_selection[,"sick"]==1,] #get all local tweets labelled as "sick" | ||||||||
| # #png(filename=filenames[2],width=2000,height=2000) | ||||||||
| # p <- print_map(my_ggmap_local,my_title = "Sick tweets", local_sick,"red",pt_size=pt_size_local) | ||||||||
| # ggsave(file=filenames[2],width=img_size,height=img_size) | ||||||||
| # #dev.off() #close pdf | ||||||||
| # | ||||||||
| # remove(list=c("p","local_sick")) #remove map created and datatable to save memory; | ||||||||
| # | ||||||||
| # ##print all tweets labelled as "healthy" in the selected region## | ||||||||
| # local_healthy <- local_selection[local_selection[,"sick"]==0,] | ||||||||
| # | ||||||||
| # p <-print_map(my_ggmap_local,my_title="Healthy tweets",local_healthy,colo="blue",pt_size=pt_size_local) | ||||||||
| # ggsave(file=filenames[4],width=img_size,height=img_size) | ||||||||
| # | ||||||||
| # remove(list=c("p","local_healthy")) #remove map created and datatable to save memory | ||||||||
| # | ||||||||
| # #print all tweets labelled as "healthy" in the selected region## | ||||||||
| # local_mislabelled <- local_selection[which(local_selection[,"userID"] %in% explore$false_label[local_index]),] | ||||||||
| # | ||||||||
| # p <- print_map(my_ggmap_local,"Mislabelled Tweets",my_coord=local_mislabelled,col="deeppink",pt_size=pt_size_local) | ||||||||
| # ggsave(file=filenames[6],width=img_size,height=img_size) | ||||||||
| # | ||||||||
| # remove(list=c("p","local_mislabelled","my_ggmap_local")) #remove map created and datatable to save memory; also delete local google-map | ||||||||
| # | ||||||||
| # #reading all created images | ||||||||
| # plots <- lapply(ll <- list.files(patt='.*[.]png'),function(x){ | ||||||||
| # img <- as.raster(readPNG(x)) | ||||||||
| # grid::rasterGrob(img, interpolate = FALSE) | ||||||||
| # }) | ||||||||
| # #changing wd to root_path and saving images in single pdf | ||||||||
| # setwd(paste0(root_path)) | ||||||||
| # ggsave(paste0("/plots/","scatterplots_",tag,".pdf"), marrangeGrob(grobs=plots, nrow=1, ncol=1)) | ||||||||
| # unlink("img_tmp",recursive=T) #delete directory with png-images | ||||||||
| # } | ||||||||
| # | ||||||||
| # plot_location(sick_df,explore_sick,"sick_df") | ||||||||
| # plot_location(healthy_df,explore_healthy,"healthy_df") | ||||||||
| # | ||||||||
| #tweets on maps using hexbin plots---- | ||||||||
| plot_location_hexbin <- function(datatable,explore,tag) #function print spatial distribution of sick tweets | ||||||||
| { | ||||||||
| ###set-up### | ||||||||
| #datatable <- datatable[,.(userID,longitude,latitude,sick)] #prune datatable to make it smaller > doesn't help to free up memory > actually *uses* memory | ||||||||
| coord_cont <- c(-125,-66,25,50) #select only tweets from mainland USA | ||||||||
| coord_local <- c(-80,-66,38,43) #select only tweets on the East Coast | ||||||||
| #set the colors, number intervals, interval location | ||||||||
| cr <- colorRampPalette(c("green","blue")) | ||||||||
| #number of bins to be use | ||||||||
| xbins = 100 | ||||||||
| #create titles for each map | ||||||||
| img_names <- c("alltweets_cont","sicktweets_cont","healthytweets_cont","mislabelled_cont","alltweets_local", "sicktweets_local", "healthytweets_local","mislabelledtweets_local") | ||||||||
| filenames <- paste(img_names,tag,sep="_") | ||||||||
| # #create a list to store plot; only needed if you want to record plot with recordPlot() | ||||||||
| # num.plots <- 8 | ||||||||
| # my.plots <- vector(num.plots, mode='list') | ||||||||
| #create a list to store hexbins | ||||||||
| num.plots <- 8 | ||||||||
| my.bins <- vector(num.plots,mode="list") | ||||||||
| # create hexbins for continent data | ||||||||
| continent <- coord_selection(datatable,coord_cont) | ||||||||
| my.bins[[1]] <- hexbin(continent$longitude,continent$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- continent[continent[,sick]==1,] | ||||||||
| my.bins[[2]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- continent[continent[,sick]==0,] | ||||||||
| my.bins[[3]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- continent[which(continent[,userID] %in% explore$false_label),] | ||||||||
| my.bins[[4]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| remove(continent) #to save memory | ||||||||
| gc() #garbage collection | ||||||||
| # create hexbins for local data | ||||||||
| local <- coord_selection(datatable,coord_local) | ||||||||
| my.bins[[5]] <- hexbin(local$longitude,local$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- local[local[,sick]==1,] | ||||||||
| my.bins[[6]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- local[local[,sick]==0,] | ||||||||
| my.bins[[7]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| temp <- local[which(local[,userID] %in% explore$false_label),] | ||||||||
| my.bins[[8]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T) | ||||||||
| remove(local) #to save memory | ||||||||
| gc() #garbage collection | ||||||||
| ###create maps with tweets for the whole continent### | ||||||||
| ##print all tweets in the whole USA## | ||||||||
| pdf(paste0("plots/",'HexbinPlots_',tag,'.pdf'), onefile=TRUE) | ||||||||
| for (i in 1:num.plots) | ||||||||
| { | ||||||||
| my.bins[[i]]@count <- log(my.bins[[i]]@count)#log-transforming counts in order to improve readability | ||||||||
| gplot.hexbin(my.bins[[i]],style="colorscale",pen=0,border= 'white', minarea = 0.01, maxarea = 1,colramp=cr,legend=1.5,mincnt=0, xlab="longitude",ylab="latitude",main=filenames[i],colorcut=seq(0,1,length=10)) | ||||||||
| } | ||||||||
| graphics.off() | ||||||||
| remove(list=c("my.bins")) | ||||||||
| } | ||||||||
| plot_location_hexbin(sick_df,explore_sick,"sick_df") | ||||||||
| plot_location_hexbin(healthy_df,explore_healthy,"healthy_df") | ||||||||
| # histogram of longitude and latitude ---- | ||||||||
| hist_coord <- function(datatable, tag,explore){ | ||||||||
| #create filenames# | ||||||||
| filenames <- c("all_tweets","all_tweets","sicktweets","sicktweets","healthytweets","healthytweets","mislabelled","mislabelled") | ||||||||
| filenames <- paste(filenames,tag,sep="_") | ||||||||
| filenames <- paste(filenames,c("lon","lat")) | ||||||||
| #define root for transformation | ||||||||
| root <- 1/2 | ||||||||
| root_tag <- as.character(round(root,2)) | ||||||||
| #prune datatable to save memory | ||||||||
| datatable <- datatable[,.(userID,longitude,latitude,sick)] | ||||||||
| #create a list to store hexbins | ||||||||
| num.plots <- 8 | ||||||||
| my.subsets <- vector(num.plots,mode="list") | ||||||||
| # create subsets of datatable for analysis | ||||||||
| my.subsets[[1]] <- datatable | ||||||||
| my.subsets[[2]] <- datatable[datatable[,sick]==1,] | ||||||||
| my.subsets[[3]] <- datatable[datatable[,sick]==0,] | ||||||||
| my.subsets[[4]] <- datatable[which(datatable[,userID] %in% explore$false_label),] | ||||||||
| remove(datatable) #to save memory | ||||||||
| ##create histogram of whole datatable## | ||||||||
| pdf(file=paste0("plots/","HistogramOfCoordinates_",tag,".pdf"),onefile=T,width=20) | ||||||||
| par(mfrow=c(1,2)) | ||||||||
| for (i in 1:(num.plots/2)){ | ||||||||
| h <- hist(my.subsets[[1]][,longitude], breaks = length(unique(my.subsets[[1]][,longitude])),plot=F) #save histdata | ||||||||
| h$counts <- (h$counts)**root | ||||||||
| plot(h,xlab="longitude",ylab=paste0("(frequency)^",root_tag), main=filenames[(2*i)-1]) | ||||||||
| remove(list =c("h")) | ||||||||
| h <- hist(my.subsets[[1]][,latitude], breaks = length(unique(my.subsets[[1]][,latitude])),plot=F) #save histdata | ||||||||
| h$counts <- (h$counts)**root | ||||||||
| plot(h,xlab="latitude",ylab=paste0("(frequency)^",root_tag), main=filenames[(2*i)]) | ||||||||
| remove(list =c("h")) | ||||||||
| my.subsets <- my.subsets[-1] #removing subset that was just used to save memory | ||||||||
| gc() | ||||||||
| } | ||||||||
| dev.off() | ||||||||
| } | ||||||||
| hist_coord(sick_df, "sick_df",explore_sick) | ||||||||
| hist_coord(healthy_df,"healthy_df",explore_healthy) | ||||||||
| ### ---------------- here we analyse the user/state activity ------------ | ||||||||
| ##hist of US states activity ---- | ||||||||
| #state_abbr = c("dc","as","gu","mp","vi","pr","hi","ak","ct","me","ma","nh","ri","vt","nj","ny","de","md","pa","va","wv","al","fl","ga","ky","ms","nc","sc","tn","il","in","mi","mn","oh","wi","ar","la","nm","ok","tx","ia","ks","mo","ne","co","mt","nd","sd","ut","wy","az","ca","nv","id","or","wa") | ||||||||
| #state_names = c("district of columbia","samoa","guam","northern mariana islands","virgin islands","puerto rico","hawaii","alaska","connecticut","maine","massachusetts","new hampshire","rhode island","vermont","new jersey","new york","delaware","maryland","pennsylvania","virginia","west virginia","alabama","florida","georgia","kentucky","mississippi","north carolina","south carolina","tennessee","illinois","indiana","michigan","minnesota","ohio","wisconsin","arkansas","louisiana","new mexico","oklahoma","texas","iowa","kansas","missouri","nebraska","colorado","montana","north dakota","south dakota","utah","wyoming","arizona","california","nevada","idaho","oregon","washington") | ||||||||
| # p <- ggplot(data = states_activity, aes(x = states_activity$freq))+ | ||||||||
| # geom_histogram( ) + ggtitle(paste0(title_plot,tag,'- States activity'))+ | ||||||||
| # xlab('numb. of tweets') + ylab("num. of states") + theme_bw()+scale_y_discrete() | ||||||||
| #function that takes datatable and plot histogram of tweet activity with regard to state and vice versa | ||||||||
| hist_states <- function(datatable,tag,title_plot = "All tweets from "){ | ||||||||
| num_states<-length(unique(datatable[,state])) | ||||||||
| states_activity<- datatable[,.N,by=.(state)] | ||||||||
| colnames(states_activity) <- c("state","freq") | ||||||||
| filenames <- paste0("plots/","histogram_states_",tag,".pdf") | ||||||||
| pdf(file=filenames,width=14) | ||||||||
| par(mfrow=c(1,2)) | ||||||||
| hist(states_activity$freq, breaks = "FD",main= paste0(title_plot,tag,' - States activity'), xlab = 'numb. of tweets', ylab = "num. of states") | ||||||||
| barplot(as.array(states_activity$freq),names.arg=states_activity$state,ylab="num. of tweets",xlab = "states",main=paste0(title_plot,tag,' States activity')) | ||||||||
| dev.off() | ||||||||
| } | ||||||||
| hist_states(sick_df, "sick_df") | ||||||||
| hist_states(healthy_df,"healthy_df") | ||||||||
| ## plot histogram of User ID ---- | ||||||||
| user_activity <- function(datatable,tag){#datatable has to be in the form of a data.table; preferentially with key already set to "userID" | ||||||||
| setkey(datatable,"userID") | ||||||||
| user_ac <- datatable[,.N,by=.(userID)] #".N" is a shortcut for length(current_object), in this case, it outputs the nunber of occurences of each user in the column userID; .() is a shorthand for "list" | ||||||||
| filenames <- paste0("plots/","user_activity_",tag,".pdf") | ||||||||
| pdf(file=filenames) | ||||||||
| activity_plot <- ggplot(data = user_ac, aes(x = user_ac[,N]))+ | ||||||||
| geom_histogram( ) + ggtitle(paste0('user activity_',tag))+ | ||||||||
| xlab('numb. of tweets') + ylab("num. of users") + theme_bw() | ||||||||
| print(activity_plot) | ||||||||
| dev.off() | ||||||||
| } | ||||||||
| user_activity(sick_df,"sick_df") | ||||||||
| user_activity(healthy_df,"healthy_df") | ||||||||
| }) #end of profvis | ||||||||
| Code | File | Memory (MB) | Time (ms) | |||
|---|---|---|---|---|---|---|
user_activity | <expr> | -3274.8 | 2185.6 | 5160 | ||
hist_states | <expr> | 0 | 1125.8 | 1100 | ||
hist_coord | <expr> | -18374.3 | 17937.8 | 51800 | ||
plot_location_hexbin | <expr> | -18604.0 | 16564.2 | 27160 | ||
str | <expr> | 0 | 54.2 | 50 | ||
explore_data | <expr> | -5182.3 | 5546.4 | 3870 | ||
coord_selection | <expr> | -3176.5 | 3360.2 | 8490 | ||
library | <expr> | 0 | 2.0 | 160 | ||